#Reading 5 years data from csv files and storing them in 5 different variables
data_2016 <- read.csv("/Users/hema/Downloads/patient_satisfaction/cms_hospital_patient_satisfaction_2016.csv")
data_2017 <- read.csv("/Users/hema/Downloads/patient_satisfaction/cms_hospital_patient_satisfaction_2017.csv")
data_2018 <- read.csv("/Users/hema/Downloads/patient_satisfaction/cms_hospital_patient_satisfaction_2018.csv")
data_2019 <- read.csv("/Users/hema/Downloads/patient_satisfaction/cms_hospital_patient_satisfaction_2019.csv")
data_2020 <- read.csv("/Users/hema/Downloads/patient_satisfaction/cms_hospital_patient_satisfaction_2020.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Deleting unimportant columns from the dataset that ends with footnote
data_2016 <- data_2016 %>% select( -Phone.Number, -ends_with(".Footnote"))
# Deleting unimportant columns from the dataset that ends with footnote
data_2017 <- data_2017 %>% select( -Phone.Number, -ends_with(".Footnote"))
# Deleting unimportant columns from the dataset that ends with footnote
data_2018 <- data_2018 %>% select(-Phone.Number, -ends_with(".Footnote"))
# Deleting unimportant columns from the dataset that ends with footnote
data_2019 <- data_2019 %>% select( -Phone.Number, -ends_with(".Footnote"))
# Deleting unimportant columns from the dataset that ends with footnote
data_2020 <- data_2020 %>% select(-Phone.Number, -ends_with(".Footnote"))
# Combining the datasets into one
hosp <- rbind(data_2016,data_2017,data_2018,data_2019, data_2020)
head(hosp)
## Facility.ID Facility.Name Address City
## 1 10001 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN
## 2 10001 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN
## 3 10001 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN
## 4 10001 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN
## 5 10001 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN
## 6 10001 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN
## State ZIP.Code County.Name HCAHPS.Measure.ID
## 1 AL 36301 HOUSTON H_STAR_RATING
## 2 AL 36301 HOUSTON H_CLEAN_HSP_A_P
## 3 AL 36301 HOUSTON H_CLEAN_HSP_SN_P
## 4 AL 36301 HOUSTON H_CLEAN_HSP_U_P
## 5 AL 36301 HOUSTON H_CLEAN_LINEAR_SCORE
## 6 AL 36301 HOUSTON H_CLEAN_STAR_RATING
## HCAHPS.Question
## 1 Summary star rating
## 2 Patients who reported that their room and bathroom were "Always" clean
## 3 Patients who reported that their room and bathroom were "Sometimes" or "Never" clean
## 4 Patients who reported that their room and bathroom were "Usually" clean
## 5 Cleanliness - linear mean score
## 6 Cleanliness - star rating
## HCAHPS.Answer.Description Patient.Survey.Star.Rating
## 1 Summary star rating 3
## 2 Room was "always" clean Not Applicable
## 3 Room was "sometimes" or "never" clean Not Applicable
## 4 Room was "usually" clean Not Applicable
## 5 Cleanliness - linear mean score Not Applicable
## 6 Cleanliness - star rating 2
## HCAHPS.Answer.Percent HCAHPS.Linear.Mean.Value Number.of.Completed.Surveys
## 1 Not Applicable Not Applicable 1213
## 2 65 Not Applicable 1213
## 3 12 Not Applicable 1213
## 4 23 Not Applicable 1213
## 5 Not Applicable 84 1213
## 6 Not Applicable Not Applicable 1213
## Survey.Response.Rate.Percent Start.Date End.Date Year Hospital.Type
## 1 27 4/1/15 3/31/16 2016 Acute Care Hospitals
## 2 27 4/1/15 3/31/16 2016 Acute Care Hospitals
## 3 27 4/1/15 3/31/16 2016 Acute Care Hospitals
## 4 27 4/1/15 3/31/16 2016 Acute Care Hospitals
## 5 27 4/1/15 3/31/16 2016 Acute Care Hospitals
## 6 27 4/1/15 3/31/16 2016 Acute Care Hospitals
## Hospital.Ownership Emergency.Services
## 1 Government - Hospital District or Authority Yes
## 2 Government - Hospital District or Authority Yes
## 3 Government - Hospital District or Authority Yes
## 4 Government - Hospital District or Authority Yes
## 5 Government - Hospital District or Authority Yes
## 6 Government - Hospital District or Authority Yes
## Meets.criteria.for.promoting.interoperability.of.EHRs Hospital.overall.rating
## 1 Y 3
## 2 Y 3
## 3 Y 3
## 4 Y 3
## 5 Y 3
## 6 Y 3
## Mortality.national.comparison Safety.of.care.national.comparison
## 1 Same as the National average Above the National average
## 2 Same as the National average Above the National average
## 3 Same as the National average Above the National average
## 4 Same as the National average Above the National average
## 5 Same as the National average Above the National average
## 6 Same as the National average Above the National average
## Readmission.national.comparison Patient.experience.national.comparison
## 1 Same as the National average Below the National average
## 2 Same as the National average Below the National average
## 3 Same as the National average Below the National average
## 4 Same as the National average Below the National average
## 5 Same as the National average Below the National average
## 6 Same as the National average Below the National average
## Effectiveness.of.care.national.comparison
## 1 Same as the National average
## 2 Same as the National average
## 3 Same as the National average
## 4 Same as the National average
## 5 Same as the National average
## 6 Same as the National average
## Timeliness.of.care.national.comparison
## 1 Same as the National average
## 2 Same as the National average
## 3 Same as the National average
## 4 Same as the National average
## 5 Same as the National average
## 6 Same as the National average
## Efficient.use.of.medical.imaging.national.comparison
## 1 Same as the National average
## 2 Same as the National average
## 3 Same as the National average
## 4 Same as the National average
## 5 Same as the National average
## 6 Same as the National average
# Renaming columns
colnames(hosp)[colnames(hosp) == "Mortality.national.comparison"] <- "Mortality"
colnames(hosp)[colnames(hosp) == "Readmission.national.comparison"] <- "Readmission"
colnames(hosp)[colnames(hosp) == "Effectiveness.of.care.national.comparison"] <- "Effectiveness"
colnames(hosp)[colnames(hosp) == "Efficient.use.of.medical.imaging.national.comparison"] <- "Efficient.imaging"
colnames(hosp)[colnames(hosp) == "Patient.Survey.Star.Rating"] <- "Patient.Survey.Rate"
colnames(hosp)[colnames(hosp) == "Safety.of.care.national.comparison"] <- "Safety"
colnames(hosp)[colnames(hosp) == "Patient.experience.national.comparison"] <- "Patient.experience"
colnames(hosp)[colnames(hosp) == "Timeliness.of.care.national.comparison"] <- "Timeliness"
colnames(hosp)[colnames(hosp) == "HCAHPS.Question"] <- "Topics"
colnames(hosp)[colnames(hosp) == "Number.of.Completed.Surveys"] <- "No_Surveys"
colnames(hosp)[colnames(hosp) == "Hospital.overall.rating"] <- "Hosp.rating"
colnames(hosp)[colnames(hosp) == "Meets.criteria.for.promoting.interoperability.of.EHRs"] <- "EHR_criteria"
colnames(hosp)[colnames(hosp) == "HCAHPS.Linear.Mean.Value"] <- "Mean.score"
colnames(hosp)[colnames(hosp) == "HCAHPS.Answer.Percent"] <- "Ans.perc"
colnames(hosp)[colnames(hosp) == "Survey.Response.Rate.Percent"] <- "Response.rate.perc"
colnames(hosp)[colnames(hosp) == "HCAHPS.Answer.Description"] <- "Ans.desc"
colnames(hosp)[colnames(hosp) == "HCAHPS.Measure.ID"] <- "Measure.ID"
# Removing phone number var
hosp$Phone.Number <-NULL
hosp$ZIP.Code <-NULL
hosp$Start.Date <-NULL
hosp$End.Date <-NULL
hosp$Measure.ID <- NULL
# Extracting star ratings from patient survey data, discarding rows with 'Not Applicable' or 'Not Available' values
star_ratings <- hosp %>%
filter(`Patient.Survey.Rate` != 'Not Applicable' & `Patient.Survey.Rate` != 'Not Available')
# Extracting linear mean values from patient survey data, discarding rows with 'Not Applicable' or 'Not Available' values
LMVs <- hosp %>%
filter(`Mean.score` != 'Not Applicable' & `Mean.score` != 'Not Available')
#creating "summary_str_rate" df with only rows that has value Topics == 'Summary star rating'
summary_str_rate <- star_ratings[star_ratings$Topics == 'Summary star rating', ]
# Creating "hospitals_data" df with only specific columns
hospitals_data <- dplyr::select(summary_str_rate,
`Facility.ID`, `Facility.Name`, `Address`, `City`,
`State`, `County.Name`, `Year`, `Hospital.Type`, `Hospital.Ownership`,
`Emergency.Services`, `EHR_criteria`, `Hosp.rating`, `Mortality`,
`Safety`, `Readmission`, `Effectiveness`, `Timeliness`,
`Efficient.imaging`, `Patient.Survey.Rate`)
# checking if "Not Available" Values are present in "Hosp.rating"
any(hospitals_data$Hosp.rating %in% c('Not Available'))
## [1] TRUE
# Dropping rows where value is "Not Available" from the dataset
hospitals_data <- hospitals_data[!hospitals_data$Hosp.rating %in% c('Not Available'), ]
# converting Hosp,rating column to numeric
hospitals_data$Hosp.rating <- as.numeric(hospitals_data$Hosp.rating)
# Replace 'Y' with 1 and blanks with 0 in the 'EHR_criteria' column
hospitals_data$EHR_criteria <- ifelse(hospitals_data$EHR_criteria == 'Y', 1, ifelse(hospitals_data$EHR_criteria == '', 0, hospitals_data$EHR_criteria))
# Extracting the "Cleanliness star ratings" and merging with hospitals_data
cleanliness_star_Rate <- star_ratings %>%
filter(Topics == 'Cleanliness - star rating') %>%
select(Facility.ID, Year, Patient.Survey.Rate) %>%
rename(cleanliness_star_Rate_Rating = Patient.Survey.Rate)
hospitals_data <- merge(hospitals_data, cleanliness_star_Rate, by = c("Facility.ID", "Year"))
# Extracting "Nurse Communication star ratings" and merging with hospitals_data
nurse_commu_star_rate <- star_ratings %>%
filter(Topics == 'Nurse communication - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Nurse_Communication_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, nurse_commu_star_rate, by = c("Facility.ID", "Year"))
# Extracting "doctor communication star rating" and merging with hospitals_data
doc_commu_star_rate <- star_ratings %>%
filter(Topics == 'Doctor communication - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Doctor_Communication_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, doc_commu_star_rate, by = c("Facility.ID", "Year"))
# Extracting "staff response star rating" and merging with hospitals_data
staff_response_star_rate <- star_ratings %>%
filter(Topics == 'Staff responsiveness - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Staff_Responsiveness_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, staff_response_star_rate, by = c("Facility.ID", "Year"))
# Extracting "Pain Management star ratings" and merging with hospitals_data
pain_Mang_star_rate <- star_ratings %>%
filter(Topics == 'Pain management - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Pain_Management_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, pain_Mang_star_rate, by = c("Facility.ID", "Year"))
# Extracting "Communication About Medicines star ratings" and merging with hospitals_data
medicine_commu_star_rate <- star_ratings %>%
filter(Topics == 'Communication about medicines - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Communication_About_Medicine_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, medicine_commu_star_rate, by = c("Facility.ID", "Year"))
# Extracting "Discharge Information star ratings" and merging with hospitals_data
disch_info_sta_rate <- star_ratings %>%
filter(Topics == 'Discharge information - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Discharge_Info_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, disch_info_sta_rate, by = c("Facility.ID", "Year"))
# Extracting "Care Transition star ratings" and merging with hospitals_data
care_trans_star_rate <- star_ratings %>%
filter(Topics == 'Care transition - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Care_Transition_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, care_trans_star_rate, by = c("Facility.ID", "Year"))
# Extracting "Overall Hospital Rating star ratings" and merging with hospitals_data
hosp_rating_star_rate <- star_ratings %>%
filter(Topics == 'Overall hospital rating - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Hospital_Rating_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, hosp_rating_star_rate, by = c("Facility.ID", "Year"))
# Extracting "Quietness star ratings" and merging with hospitals_data
quietness_star_rate <- star_ratings %>%
filter(Topics == 'Quietness - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Quietness_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, quietness_star_rate, by = c("Facility.ID", "Year"))
# Extracting "Recommend hospital star ratings" and merging with hospitals_data
recommend_hosp_star_rate <- star_ratings %>%
filter(Topics == 'Recommend hospital - star rating') %>%
select(`Facility.ID`, Year, `Patient.Survey.Rate`) %>%
rename(Recommendation_Star_Rating = `Patient.Survey.Rate`)
hospitals_data <- merge(hospitals_data, recommend_hosp_star_rate, by = c("Facility.ID", "Year"))
#dim(hospitals_data)
#colnames(hospitals_data)
# Extracting "Cleanliness linear mean score" and merging with hospitals_data
cleanliness_LinMeaScore <- LMVs %>%
filter(Topics == 'Cleanliness - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(cleanliness_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, cleanliness_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Nurse Communication linear mean score" and merging with hospitals_data
nurse_comm_LinMeaScore <- LMVs %>%
filter(Topics == 'Nurse communication - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(nurse_comm_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, nurse_comm_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Doctor Communication linear mean score" and merging with hospitals_data
doc_comm_LinMeaScore <- LMVs %>%
filter(Topics == 'Doctor communication - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(doc_comm_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, doc_comm_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Staff Responsiveness linear mean score" and merging with hospitals_data
staff_response_LinMeaScore <- LMVs %>%
filter(Topics == 'Staff responsiveness - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(staff_response_LinMeaScore = `Mean.score`)
# Merge the staff_response_LinMeaScore data with hospitals_data
hospitals_data <- merge(hospitals_data, staff_response_LinMeaScore, by = c("Facility.ID", "Year"))
# Display dimensions of the resulting hospitals_data
# Extracting "Pain Management linear mean score" and merging with hospitals_data
pain_mang_LinMeaScore <- LMVs %>%
filter(Topics == 'Pain management - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(pain_mang_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, pain_mang_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Communication About Medicines linear mean score" and merging with hospitals_data
med_commu_LinMeaScore <- LMVs %>%
filter(Topics == 'Communication about medicines - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(med_commu_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, med_commu_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Discharge Information linear mean score" and merging with hospitals_data
discharge_info_LinMeaScore <- LMVs %>%
filter(Topics == 'Discharge information - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(discharge_info_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, discharge_info_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Care Transition linear mean score" and merging with hospitals_data
care_trans_LinMeaScore <- LMVs %>%
filter(Topics == 'Care transition - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(care_trans_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, care_trans_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Overall Hospital Rating linear mean score" and merging with hospitals_data
hosp_rate_LinMeaScore <- LMVs %>%
filter(Topics == 'Overall hospital rating - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(hosp_rate_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, hosp_rate_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Quietness linear mean score" and merging with hospitals_data
quietness_LinMeaScore <- LMVs %>%
filter(Topics == 'Quietness - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(quietness_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, quietness_LinMeaScore, by = c("Facility.ID", "Year"))
# Extracting "Recommendation linear mean score" and merging with hospitals_data
recommend_LinMeaScore <- LMVs %>%
filter(Topics == 'Recommend hospital - linear mean score') %>%
select(`Facility.ID`, Year, `Mean.score`) %>%
rename(recommend_LinMeaScore = `Mean.score`)
hospitals_data <- merge(hospitals_data, recommend_LinMeaScore, by = c("Facility.ID", "Year"))
#dim(hospitals_data)
#str(hospitals_data)
library(dplyr)
# Removing "not Available" and converting 'National' to 'national' for specified columns
variables_to_clean <- c("Mortality", "Safety", "Readmission", "Timeliness", "Efficient.imaging", "Effectiveness")
for (variable in variables_to_clean) {
# Converting 'National' to 'national'
hospitals_data[[variable]] <- gsub("\\bNational\\b", "national", hospitals_data[[variable]], ignore.case = TRUE)
# Removing rows with "not Available"
hospitals_data <- hospitals_data[!hospitals_data[[variable]] %in% "Not Available", ]
}
# Checking unique values after cleaning
for (variable in variables_to_clean) {
print(unique(hospitals_data[[variable]]))
}
## [1] "Same as the national average" "Above the national average"
## [3] "Below the national average"
## [1] "Below the national average" "Above the national average"
## [3] "Same as the national average"
## [1] "Below the national average" "Above the national average"
## [3] "Same as the national average"
## [1] "Below the national average" "Same as the national average"
## [3] "Above the national average"
## [1] "Below the national average" "Above the national average"
## [3] "Same as the national average"
## [1] "Same as the national average" "Above the national average"
## [3] "Below the national average"
# Converting specific columns to numeric in hospitals_data
numeric_columns <- 19:41
for (col_index in numeric_columns) {
hospitals_data[, col_index] <- as.numeric(unlist(hospitals_data[, col_index]))
}
# Creating binary variables for hospital rating variable
hospitals_data$hosp_rate <- ifelse(hospitals_data$Hosp.rating >=4, 1, 0)
# Converting star rating variables to factors
hospitals_data$hosp_rate <- factor(hospitals_data$hosp_rate)
#checking levels
levels(hospitals_data$hosp_rate)
## [1] "0" "1"
#creating duplicate dataframe of hospitals_data and converting Hosp.rating to factor
hospitals_data1 <- hospitals_data
hospitals_data1$Hosp.rating <- as.factor(hospitals_data1$Hosp.rating)
# creating a constant column 'n' with value 1 to hospitals_data for potential counting or summarization.
hospitals_data$n = 1
# Grouping the hospital data by Facility.Name, Facility.ID, calculating the mean for different ratings and survey responses.
hospitals_data_agg <- hospitals_data %>%
group_by(Facility.Name, Facility.ID) %>%
summarize(Hosp.rating=mean(Hosp.rating),
Patient.Survey.Rate=mean(Patient.Survey.Rate),
cleanliness_star_Rate_Rating=mean(cleanliness_star_Rate_Rating),
Nurse_Communication_Star_Rating=mean(Nurse_Communication_Star_Rating),
Doctor_Communication_Star_Rating=mean(Doctor_Communication_Star_Rating),
Staff_Responsiveness_Star_Rating=mean(Staff_Responsiveness_Star_Rating),
Communication_About_Medicine_Star_Rating=mean(Communication_About_Medicine_Star_Rating),
Discharge_Info_Star_Rating=mean(Discharge_Info_Star_Rating),
Care_Transition_Star_Rating=mean(Care_Transition_Star_Rating),
Hospital_Rating_Star_Rating=mean(Hospital_Rating_Star_Rating),
Quietness_Star_Rating=mean(Quietness_Star_Rating),
Recommendation_Star_Rating=mean(Recommendation_Star_Rating),
cleanliness_LinMeaScore=mean(cleanliness_LinMeaScore),
nurse_comm_LinMeaScore=mean(nurse_comm_LinMeaScore),
doc_comm_LinMeaScore=mean(doc_comm_LinMeaScore),
staff_response_LinMeaScore=mean(staff_response_LinMeaScore),
med_commu_LinMeaScore=mean( med_commu_LinMeaScore),
discharge_info_LinMeaScore=mean(discharge_info_LinMeaScore),
care_trans_LinMeaScore=mean(care_trans_LinMeaScore),
hosp_rate_LinMeaScore=mean(hosp_rate_LinMeaScore),
quietness_LinMeaScore=mean(quietness_LinMeaScore),
recommend_LinMeaScore=mean(recommend_LinMeaScore),
# Summing up the 'n' column and removing the group structure.
n=sum(n),
.groups = "drop")
# Displaying frequency tables for star rating variables
table(hospitals_data$Hosp.rating)
##
## 1 2 3 4 5
## 189 1040 2104 1156 118
table(hospitals_data$Patient.Survey.Rate)
##
## 1 2 3 4 5
## 45 859 2238 1434 31
table(hospitals_data$cleanliness_star_Rate_Rating)
##
## 1 2 3 4 5
## 439 1727 1337 1018 86
table(hospitals_data$Nurse_Communication_Star_Rating)
##
## 1 2 3 4 5
## 34 824 1611 1918 220
table(hospitals_data$Doctor_Communication_Star_Rating)
##
## 1 2 3 4 5
## 96 1209 1987 1162 153
table(hospitals_data$Staff_Responsiveness_Star_Rating)
##
## 1 2 3 4 5
## 166 910 1757 1675 99
table(hospitals_data$Communication_About_Medicine_Star_Rating)
##
## 1 2 3 4 5
## 81 1597 2146 745 38
table(hospitals_data$Discharge_Info_Star_Rating)
##
## 1 2 3 4 5
## 116 703 1602 2013 173
table(hospitals_data$Care_Transition_Star_Rating)
##
## 1 2 3 4 5
## 209 1115 1580 1661 42
table(hospitals_data$Hospital_Rating_Star_Rating)
##
## 1 2 3 4 5
## 83 690 1725 1952 157
table(hospitals_data$Quietness_Star_Rating)
##
## 1 2 3 4 5
## 562 1002 1989 982 72
table(hospitals_data$Recommendation_Star_Rating)
##
## 1 2 3 4 5
## 243 895 1774 1522 173
#Displaying summary satistics for all Linear mean score variables
summary(hospitals_data$cleanliness_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.00 84.00 87.00 86.54 89.00 96.00
summary(hospitals_data$nurse_comm_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 74.00 90.00 91.00 91.08 92.00 96.00
summary(hospitals_data$doc_comm_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 81.00 90.00 92.00 91.41 93.00 98.00
summary(hospitals_data$staff_response_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 62.0 82.0 85.0 84.3 87.0 96.0
summary(hospitals_data$med_commu_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 58.00 76.00 78.00 77.98 80.00 89.00
summary(hospitals_data$discharge_info_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 67.00 85.00 87.00 86.98 89.00 95.00
summary(hospitals_data$care_trans_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 80.0 81.0 81.1 83.0 90.0
summary(hospitals_data$hosp_rate_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71.00 87.00 89.00 88.45 90.00 98.00
summary(hospitals_data$quietness_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65.00 79.00 82.00 82.01 85.00 93.00
summary(hospitals_data$recommend_LinMeaScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 61.00 86.00 88.00 87.85 91.00 99.00
#Hospital overall rating distribution
library(ggplot2)
# Creating a data frame for the counts
rating_counts <- table(hospitals_data$Hosp.rating)
rating_df <- data.frame(Rating = as.numeric(names(rating_counts)), Count = as.numeric(rating_counts))
# Plotting the barplot
ggplot(rating_df, aes(x = factor(Rating), y = Count)) +
geom_bar(stat = "identity", fill = "lightblue", color = "black") +
geom_text(aes(label = Count), vjust = -0.5, size = 3, color = "black") +
labs(title = "Distribution of Hospital Ratings", x = "Hospital Rating", y = "Frequency") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = 10))
# Checkinh distribution of hospital rating variable after transformation to binary
library(ggplot2)
# Create a bar chart with values on top and custom x-axis labels
bar_chart <- ggplot(hospitals_data, aes(x = factor(hosp_rate), fill = hosp_rate)) +
geom_bar(stat = "count", color = "black") +
geom_text(
aes(label = ..count.., y = ..count..),
stat = "count",
position = position_stack(vjust = 0.5),
color = "black",
size = 3
) +
labs(title = "Distribution of Hospital Ratings",
x = "Experience",
y = "Count") +
scale_fill_manual(values = c("lightblue", "lightblue")) + # Set light blue color
theme_minimal() +
scale_x_discrete(labels = c("0 (Negative)", "1 (Positive)")) # Custom x-axis labels
# Display the bar chart with values on top and custom x-axis labels
print(bar_chart)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Distribution of hospital types
library(ggplot2)
# Create a barplot with ggplot2
ggplot(data = hospitals_data, aes(x = reorder(Hospital.Type, -table(Hospital.Type)[Hospital.Type]))) +
geom_bar(fill = "lightblue", color = "black") +
labs(title = "Distribution of Hospital Types", x = "Hospital Type", y = "Frequency") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
plot.title = element_text(hjust = 0.5, size = 14))
# Hospital ownership distribution
ggplot(data = hospitals_data, aes(y = reorder(Hospital.Ownership, -table(Hospital.Ownership)[Hospital.Ownership]))) +
geom_bar(fill = "lightblue", color = "black", stat = "count") +
geom_text(stat = 'count', aes(label = after_stat(count)), hjust = -0.0, size = 3, color = "black") +
labs(title = "Distribution of Hospital Types", y = "Hospital Ownership", x = "Frequency") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8),
plot.title = element_text(hjust = 0.5, size = 14),
plot.margin = unit(c(0, 80, 0, 0), "points")) # Adjust left margin
# Distribution of star rating variables
hist(hospitals_data$Patient.Survey.Rate,
main = 'Distribution of Patient Survey Rating',
xlab = 'Patient Survey Rating',
ylab = 'Frequency')
hist(hospitals_data$cleanliness_star_Rate_Rating,
main = 'Distribution of Cleanliness Star Rating',
xlab = 'Cleanliness Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Nurse_Communication_Star_Rating,
main = 'Distribution of Nurse Communication Star Rating',
xlab = 'Nurse Communication Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Doctor_Communication_Star_Rating,
main = 'Distribution of Doctor Communication Star Rating',
xlab = 'Doctor Communication Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Staff_Responsiveness_Star_Rating,
main = 'Distribution of Staff Responsiveness Star Rating',
xlab = 'Staff Responsiveness Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Communication_About_Medicine_Star_Rating,
main = 'Distribution of Communication About Medicine Star Rating',
xlab = 'Communication About Medicine Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Discharge_Info_Star_Rating,
main = 'Distribution of Discharge Information Star Rating',
xlab = 'Discharge Information Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Care_Transition_Star_Rating,
main = 'Distribution of Care Transition Star Rating',
xlab = 'Care Transition Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Hospital_Rating_Star_Rating,
main = 'Distribution of Hospital Rating Star Rating',
xlab = 'Hospital Rating Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Quietness_Star_Rating,
main = 'Distribution of Quietness Star Rating',
xlab = 'Quietness Star Rating',
ylab = 'Frequency')
hist(hospitals_data$Recommendation_Star_Rating,
main = 'Distribution of Recommendation Star Rating',
xlab = 'Recommendation Star Rating',
ylab = 'Frequency')
#Distributions of the Linear Mean Scores
par(mfrow=c(2, 2))
hist(hospitals_data$cleanliness_LinMeaScore,
main = 'Cleanliness Linear Mean Score',
xlab = 'Cleanliness Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$nurse_comm_LinMeaScore,
main = 'Nurse Communication Linear Mean Score',
xlab = 'Nurse Communication Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$doc_comm_LinMeaScore,
main = 'Doctor Communication Linear Mean Score',
xlab = 'Doctor Communication Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$staff_response_LinMeaScore,
main = 'Staff Responsiveness Linear Mean Score',
xlab = 'Staff Responsiveness Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$med_commu_LinMeaScore,
main = 'Communication About Medicine Linear Mean Score',
xlab = 'Communication About Medicine Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$discharge_info_LinMeaScore,
main = 'Discharge Information Linear Mean Score',
xlab = 'Discharge Information Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$care_trans_LinMeaScore,
main = 'Care Transition Linear Mean Score',
xlab = 'Care Transition Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$hosp_rate_LinMeaScore,
xlab = 'Overall Hospital Rating (LMV)',
main = 'Survey Response: Overall Hospital Rating (LMV)',
ylab = 'Frequency')
hist(hospitals_data$quietness_LinMeaScore,
main = 'Quietness Linear Mean Score',
xlab = 'Quietness Linear Mean Score',
ylab = 'Frequency')
hist(hospitals_data$recommend_LinMeaScore,
main = 'Recommendation Linear Mean Score',
xlab = 'Recommendation Linear Mean Score',
ylab = 'Frequency')
# Visualizing the relationship between different care aspects' linear mean scores and overall hospital rating
par(mfrow=c(2, 2))
plot(hospitals_data$cleanliness_LinMeaScore, hospitals_data$Hosp.rating,
main = "Cleanliness LM vs. Hospital Rating",
xlab = "Cleanliness LM",
ylab = "Hospital Rating")
plot(hospitals_data$nurse_comm_LinMeaScore, hospitals_data$Hosp.rating,
main = "Nurse Communication LM vs. Hospital Rating",
xlab = "Nurse Communication LM",
ylab = "Hospital Rating")
plot(hospitals_data$doc_comm_LinMeaScore, hospitals_data$Hosp.rating,
main = "Doctor Communication LM vs. Hospital Rating",
xlab = "Doctor Communication LM",
ylab = "Hospital Rating")
plot(hospitals_data$staff_response_LinMeaScore, hospitals_data$Hosp.rating,
main = "Staff Responsiveness LM vs. Hospital Rating",
xlab = "Staff Responsiveness LM",
ylab = "Hospital Rating")
plot(hospitals_data$med_commu_LinMeaScore, hospitals_data$Hosp.rating,
main = "Communication Abt Medicine LM vs. Hospital Rating",
xlab = "Communication Abt Medicine LM",
ylab = "Hospital Rating")
plot(hospitals_data$discharge_info_LinMeaScore, hospitals_data$Hosp.rating,
main = "Discharge Information LM vs. Hospital Rating",
xlab = "Discharge Information LM",
ylab = "Hospital Rating")
plot(hospitals_data$care_trans_LinMeaScore, hospitals_data$Hosp.rating,
main = "Care Transition LM vs. Hospital Rating",
xlab = "Care Transition LM",
ylab = "Hospital Rating")
plot(hospitals_data$hosp_rate_LinMeaScore, hospitals_data$Hosp.rating,
main = "Hospital Rating LM vs. Hospital Rating",
xlab = "Hospital Rating LM",
ylab = "Hospital Rating")
plot(hospitals_data$quietness_LinMeaScore, hospitals_data$Hosp.rating,
main = "Quietness LM vs. Hospital Rating",
xlab = "Quietness LM",
ylab = "Hospital Rating")
plot(hospitals_data$recommend_LinMeaScore, hospitals_data$Hosp.rating,
main = "Recommendation LM vs. Hospital Rating",
xlab = "Recommendation LM",
ylab = "Hospital Rating")
# Distribution of hospital performnace related variables
abbreviate_labels <- function(labels) {
abbreviations <- c("Above the national average" = "ANA",
"Same as the national average" = "SNA",
"Below the national average" = "BNA")
return(abbreviations[labels])
}
# Setting up a 2x4 grid for the bar plots
par(mfrow = c(2, 4))
create_barplot <- function(variable, main_title) {
table_data <- table(hospitals_data[[variable]])
abbreviated_labels <- abbreviate_labels(names(table_data))
barplot(table_data, col = "lightblue", border = "black", xlab = variable, ylab = "Frequency", main = main_title, names.arg = abbreviated_labels)
}
# Creating bar plots for each variable
create_barplot("Mortality", "Distribution of Mortality Values")
create_barplot("Safety", "Distribution of Safety Values")
create_barplot("Readmission", "Distribution of Readmission Values")
create_barplot("Effectiveness", "Distribution of Effectiveness Values")
create_barplot("Timeliness", "Distribution of Timeliness Values")
create_barplot("Efficient.imaging", "Distribution of Efficient Imaging Values")
# Resetting the plotting layout
par(mfrow = c(1, 1))
# #creating duplicate dataframe of hospitals_data and converting Hosp.rating to factor
hospitals_data1 <- hospitals_data
hospitals_data1$Hosp.rating <- as.factor(hospitals_data1$Hosp.rating)
# Converting the columns 22 to 33 to numeric values in the hospitals_data1 dataset
for (i in 22:33) {
hospitals_data1[,i] <- as.numeric(unlist(hospitals_data1[,i]))
}
# Plotting various star ratings against overall hospital rating for different aspects of care in hospitals_data1 dataset
plot(hospitals_data1$cleanliness_star_Rate_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Nurse_Communication_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Doctor_Communication_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Staff_Responsiveness_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Communication_About_Medicine_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Discharge_Info_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Care_Transition_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Hospital_Rating_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Quietness_Star_Rating, hospitals_data1$Hosp.rating)
plot(hospitals_data1$Recommendation_Star_Rating, hospitals_data1$Hosp.rating)
# creating visulas to check relationship between variables
library(ggplot2)
# Creating jitter plots for each variable with hosp_rate
vars <- c("cleanliness_LinMeaScore", "nurse_comm_LinMeaScore", "doc_comm_LinMeaScore",
"staff_response_LinMeaScore", "med_commu_LinMeaScore", "discharge_info_LinMeaScore",
"care_trans_LinMeaScore", "quietness_LinMeaScore", "recommend_LinMeaScore")
# Looping through each variable and plot against hosp_rate
for(predictor in vars) {
plot_title <- paste(predictor, "vs. Hospital Rating", sep = " ")
plot <- ggplot(hospitals_data, aes(x = hosp_rate, y = !!sym(predictor))) +
geom_jitter() +
labs(title = plot_title, x = "Hospital Rating", y = predictor)
print(plot) # Display the plot
}
#Creating a subset of predictors from hospitals_data dataset for modeling
predictors <- hospitals_data[,31:41]
# Dropping pain management varable as it is available only for 2016 and 2017
predictors <- predictors[,-5]
summary(predictors)
## cleanliness_LinMeaScore nurse_comm_LinMeaScore doc_comm_LinMeaScore
## Min. :73.00 Min. :74.00 Min. :81.00
## 1st Qu.:84.00 1st Qu.:90.00 1st Qu.:90.00
## Median :87.00 Median :91.00 Median :92.00
## Mean :86.54 Mean :91.08 Mean :91.41
## 3rd Qu.:89.00 3rd Qu.:92.00 3rd Qu.:93.00
## Max. :96.00 Max. :96.00 Max. :98.00
## staff_response_LinMeaScore med_commu_LinMeaScore discharge_info_LinMeaScore
## Min. :62.0 Min. :58.00 Min. :67.00
## 1st Qu.:82.0 1st Qu.:76.00 1st Qu.:85.00
## Median :85.0 Median :78.00 Median :87.00
## Mean :84.3 Mean :77.98 Mean :86.98
## 3rd Qu.:87.0 3rd Qu.:80.00 3rd Qu.:89.00
## Max. :96.0 Max. :89.00 Max. :95.00
## care_trans_LinMeaScore hosp_rate_LinMeaScore quietness_LinMeaScore
## Min. :68.0 Min. :71.00 Min. :65.00
## 1st Qu.:80.0 1st Qu.:87.00 1st Qu.:79.00
## Median :81.0 Median :89.00 Median :82.00
## Mean :81.1 Mean :88.45 Mean :82.01
## 3rd Qu.:83.0 3rd Qu.:90.00 3rd Qu.:85.00
## Max. :90.0 Max. :98.00 Max. :93.00
## recommend_LinMeaScore
## Min. :61.00
## 1st Qu.:86.00
## Median :88.00
## Mean :87.85
## 3rd Qu.:91.00
## Max. :99.00
# heatmap to check correlation between the Linear Mean score variables
library(corrplot)
## corrplot 0.92 loaded
corre <- cor(predictors, method = c("pearson"))
gradient_colors <- colorRampPalette(c("darkblue", "white", "lightblue"))(100)
corrplot(corre, addCoef.col = 1, tl.cex = 0.5, number.cex = 0.35, col = gradient_colors)
# Relative weights analysis to check contribution of each variable towards "Hosp.rating"
library(rwa)
rwa <- hospitals_data %>%
rwa(outcome = "Hosp.rating",
predictors = c("cleanliness_LinMeaScore", "nurse_comm_LinMeaScore",
"doc_comm_LinMeaScore", "staff_response_LinMeaScore",
"med_commu_LinMeaScore", "discharge_info_LinMeaScore",
"care_trans_LinMeaScore", "quietness_LinMeaScore","recommend_LinMeaScore","Hospital_Rating_Star_Rating"),
applysigns = TRUE)
print(rwa)
## $predictors
## [1] "cleanliness_LinMeaScore" "nurse_comm_LinMeaScore"
## [3] "doc_comm_LinMeaScore" "staff_response_LinMeaScore"
## [5] "med_commu_LinMeaScore" "discharge_info_LinMeaScore"
## [7] "care_trans_LinMeaScore" "quietness_LinMeaScore"
## [9] "recommend_LinMeaScore" "Hospital_Rating_Star_Rating"
##
## $rsquare
## [1] 0.3835721
##
## $result
## Variables Raw.RelWeight Rescaled.RelWeight Sign
## 1 cleanliness_LinMeaScore 0.04101894 10.693933 +
## 2 nurse_comm_LinMeaScore 0.04017990 10.475188 +
## 3 doc_comm_LinMeaScore 0.02700153 7.039493 +
## 4 staff_response_LinMeaScore 0.04999606 13.034332 +
## 5 med_commu_LinMeaScore 0.03612499 9.418044 +
## 6 discharge_info_LinMeaScore 0.03509495 9.149505 +
## 7 care_trans_LinMeaScore 0.04492465 11.712179 +
## 8 quietness_LinMeaScore 0.01769028 4.611984 +
## 9 recommend_LinMeaScore 0.04828631 12.588588 +
## 10 Hospital_Rating_Star_Rating 0.04325448 11.276754 +
## Sign.Rescaled.RelWeight
## 1 10.693933
## 2 10.475188
## 3 7.039493
## 4 13.034332
## 5 9.418044
## 6 9.149505
## 7 11.712179
## 8 4.611984
## 9 12.588588
## 10 11.276754
##
## $n
## [1] 4607
##
## $lambda
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.86512826 0.2201907 0.09512423 0.2449880 0.1822323 0.11054543 0.20016974
## [2,] 0.22019073 0.6870029 0.23576252 0.3379212 0.2639048 0.20340905 0.28326488
## [3,] 0.09512423 0.2357625 0.81566652 0.1746580 0.2203030 0.15351917 0.21372018
## [4,] 0.24498798 0.3379212 0.17465802 0.7548650 0.2557501 0.20353992 0.21079626
## [5,] 0.18223234 0.2639048 0.22030297 0.2557501 0.7682281 0.22495591 0.21913075
## [6,] 0.11054543 0.2034090 0.15351917 0.2035399 0.2249559 0.85080349 0.22104931
## [7,] 0.20016974 0.2832649 0.21372018 0.2107963 0.2191308 0.22104931 0.72285171
## [8,] 0.10766871 0.1098364 0.21814292 0.1308192 0.1448300 0.04695235 0.09790591
## [9,] 0.12444413 0.2399708 0.17952601 0.1510819 0.1829167 0.16819640 0.31643660
## [10,] 0.14637513 0.2210392 0.20483248 0.1850744 0.1999301 0.16606849 0.24590310
## [,8] [,9] [,10]
## [1,] 0.10766871 0.12444413 0.1463751
## [2,] 0.10983644 0.23997081 0.2210392
## [3,] 0.21814292 0.17952601 0.2048325
## [4,] 0.13081920 0.15108187 0.1850744
## [5,] 0.14482997 0.18291668 0.1999301
## [6,] 0.04695235 0.16819640 0.1660685
## [7,] 0.09790591 0.31643660 0.2459031
## [8,] 0.91432163 0.09987588 0.1814342
## [9,] 0.09987588 0.74825171 0.3743473
## [10,] 0.18143419 0.37434725 0.7432589
##
## $RXX
## cleanliness_LinMeaScore nurse_comm_LinMeaScore
## cleanliness_LinMeaScore 1.0000000 0.6483008
## nurse_comm_LinMeaScore 0.6483008 1.0000000
## doc_comm_LinMeaScore 0.4302941 0.6964619
## staff_response_LinMeaScore 0.6591782 0.8424990
## med_commu_LinMeaScore 0.5757241 0.7743556
## discharge_info_LinMeaScore 0.4344817 0.6463287
## care_trans_LinMeaScore 0.6024932 0.8088941
## quietness_LinMeaScore 0.3587431 0.4348045
## recommend_LinMeaScore 0.4885219 0.7310360
## Hospital_Rating_Star_Rating 0.5190585 0.7251654
## doc_comm_LinMeaScore staff_response_LinMeaScore
## cleanliness_LinMeaScore 0.4302941 0.6591782
## nurse_comm_LinMeaScore 0.6964619 0.8424990
## doc_comm_LinMeaScore 1.0000000 0.6034902
## staff_response_LinMeaScore 0.6034902 1.0000000
## med_commu_LinMeaScore 0.6599109 0.7373959
## discharge_info_LinMeaScore 0.5211121 0.6158671
## care_trans_LinMeaScore 0.6621994 0.7007467
## quietness_LinMeaScore 0.5515035 0.4358591
## recommend_LinMeaScore 0.6077785 0.6000949
## Hospital_Rating_Star_Rating 0.6465579 0.6406553
## med_commu_LinMeaScore discharge_info_LinMeaScore
## cleanliness_LinMeaScore 0.5757241 0.4344817
## nurse_comm_LinMeaScore 0.7743556 0.6463287
## doc_comm_LinMeaScore 0.6599109 0.5211121
## staff_response_LinMeaScore 0.7373959 0.6158671
## med_commu_LinMeaScore 1.0000000 0.6431192
## discharge_info_LinMeaScore 0.6431192 1.0000000
## care_trans_LinMeaScore 0.7099192 0.6512694
## quietness_LinMeaScore 0.4603648 0.2783885
## recommend_LinMeaScore 0.6380720 0.5677891
## Hospital_Rating_Star_Rating 0.6656518 0.5657965
## care_trans_LinMeaScore quietness_LinMeaScore
## cleanliness_LinMeaScore 0.6024932 0.3587431
## nurse_comm_LinMeaScore 0.8088941 0.4348045
## doc_comm_LinMeaScore 0.6621994 0.5515035
## staff_response_LinMeaScore 0.7007467 0.4358591
## med_commu_LinMeaScore 0.7099192 0.4603648
## discharge_info_LinMeaScore 0.6512694 0.2783885
## care_trans_LinMeaScore 1.0000000 0.4054866
## quietness_LinMeaScore 0.4054866 1.0000000
## recommend_LinMeaScore 0.8077060 0.3980234
## Hospital_Rating_Star_Rating 0.7519643 0.5078908
## recommend_LinMeaScore Hospital_Rating_Star_Rating
## cleanliness_LinMeaScore 0.4885219 0.5190585
## nurse_comm_LinMeaScore 0.7310360 0.7251654
## doc_comm_LinMeaScore 0.6077785 0.6465579
## staff_response_LinMeaScore 0.6000949 0.6406553
## med_commu_LinMeaScore 0.6380720 0.6656518
## discharge_info_LinMeaScore 0.5677891 0.5657965
## care_trans_LinMeaScore 0.8077060 0.7519643
## quietness_LinMeaScore 0.3980234 0.5078908
## recommend_LinMeaScore 1.0000000 0.8547718
## Hospital_Rating_Star_Rating 0.8547718 1.0000000
##
## $RXY
## cleanliness_LinMeaScore nurse_comm_LinMeaScore
## 0.4587206 0.5544341
## doc_comm_LinMeaScore staff_response_LinMeaScore
## 0.4501277 0.5426517
## med_commu_LinMeaScore discharge_info_LinMeaScore
## 0.5113304 0.4548825
## care_trans_LinMeaScore quietness_LinMeaScore
## 0.5524491 0.3313154
## recommend_LinMeaScore Hospital_Rating_Star_Rating
## 0.5315184 0.5317467
# # visual to see the relative weight analysis results
# library(ggplot2)
#
# # Bar plot
# ggplot(result_df, aes(x = reorder(Variables, Rescaled.RelWeight), y = Rescaled.RelWeight, fill = result$Sign.Rescaled.RelWeight)) +
# geom_bar(stat = "identity", position = "dodge", fill = "lightblue") + # Set fill color to lightblue
# geom_text(aes(label = ifelse(Variables != "cleanliness", round(Rescaled.RelWeight, 2), "")),
# position = position_dodge(width = 0.9), vjust = 0.5) + # Add labels for variables other than "cleanliness"
# labs(title = "Relative Weights of Predictors",
# x = "Predictors",
# y = "Rescaled Relative Weight",
# fill = "Sign") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
# coord_flip()
# Flipping the coordinates for better visibility
#splitting the data into 70% training and 30% test_dataing
set.seed(123)
split_index <- sample(1:nrow(hospitals_data), 0.7 * nrow(hospitals_data))
train_data <- hospitals_data[split_index, ]
test_data <- hospitals_data[-split_index, ]
# Running logistic regression on entire dataset "hospitals_data"
log_model <- glm(hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
doc_comm_LinMeaScore + staff_response_LinMeaScore +
med_commu_LinMeaScore + discharge_info_LinMeaScore + care_trans_LinMeaScore +
quietness_LinMeaScore + recommend_LinMeaScore + EHR_criteria + Emergency.Services + Mortality + Effectiveness + Timeliness + Safety + Readmission +Efficient.imaging + Hospital.Ownership , data = hospitals_data,
family='binomial'(link='logit'))
summary(log_model)
##
## Call:
## glm(formula = hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## doc_comm_LinMeaScore + staff_response_LinMeaScore + med_commu_LinMeaScore +
## discharge_info_LinMeaScore + care_trans_LinMeaScore + quietness_LinMeaScore +
## recommend_LinMeaScore + EHR_criteria + Emergency.Services +
## Mortality + Effectiveness + Timeliness + Safety + Readmission +
## Efficient.imaging + Hospital.Ownership, family = binomial(link = "logit"),
## data = hospitals_data)
##
## Coefficients:
## Estimate
## (Intercept) -72.867485
## cleanliness_LinMeaScore 0.107453
## nurse_comm_LinMeaScore 0.180740
## doc_comm_LinMeaScore 0.054183
## staff_response_LinMeaScore 0.161807
## med_commu_LinMeaScore 0.023850
## discharge_info_LinMeaScore 0.050144
## care_trans_LinMeaScore 0.123236
## quietness_LinMeaScore -0.002282
## recommend_LinMeaScore 0.219240
## EHR_criteria1 -0.249933
## Emergency.ServicesYes -0.045605
## MortalityBelow the national average -5.845463
## MortalitySame as the national average -2.734958
## EffectivenessBelow the national average -0.853144
## EffectivenessSame as the national average -0.071803
## TimelinessBelow the national average -0.652733
## TimelinessSame as the national average -0.401245
## SafetyBelow the national average -4.675064
## SafetySame as the national average -2.470852
## ReadmissionBelow the national average -4.965368
## ReadmissionSame as the national average -2.394172
## Efficient.imagingBelow the national average -1.095964
## Efficient.imagingSame as the national average -0.138821
## Hospital.OwnershipGovernment - Hospital District or Authority -1.452272
## Hospital.OwnershipGovernment - Local -1.592429
## Hospital.OwnershipGovernment - State -1.324828
## Hospital.OwnershipPhysician -1.861661
## Hospital.OwnershipProprietary -1.182058
## Hospital.OwnershipVoluntary non-profit - Church -0.863946
## Hospital.OwnershipVoluntary non-profit - Other -1.099654
## Hospital.OwnershipVoluntary non-profit - Private -1.229179
## Std. Error
## (Intercept) 4.937881
## cleanliness_LinMeaScore 0.025538
## nurse_comm_LinMeaScore 0.072461
## doc_comm_LinMeaScore 0.049606
## staff_response_LinMeaScore 0.033890
## med_commu_LinMeaScore 0.030420
## discharge_info_LinMeaScore 0.028437
## care_trans_LinMeaScore 0.054532
## quietness_LinMeaScore 0.017338
## recommend_LinMeaScore 0.028100
## EHR_criteria1 0.401780
## Emergency.ServicesYes 0.452776
## MortalityBelow the national average 0.287306
## MortalitySame as the national average 0.180838
## EffectivenessBelow the national average 0.355249
## EffectivenessSame as the national average 0.228226
## TimelinessBelow the national average 0.177052
## TimelinessSame as the national average 0.137230
## SafetyBelow the national average 0.212811
## SafetySame as the national average 0.144976
## ReadmissionBelow the national average 0.221743
## ReadmissionSame as the national average 0.138130
## Efficient.imagingBelow the national average 0.222653
## Efficient.imagingSame as the national average 0.160237
## Hospital.OwnershipGovernment - Hospital District or Authority 0.873030
## Hospital.OwnershipGovernment - Local 0.888972
## Hospital.OwnershipGovernment - State 1.006780
## Hospital.OwnershipPhysician 1.144742
## Hospital.OwnershipProprietary 0.864486
## Hospital.OwnershipVoluntary non-profit - Church 0.865711
## Hospital.OwnershipVoluntary non-profit - Other 0.864264
## Hospital.OwnershipVoluntary non-profit - Private 0.852713
## z value Pr(>|z|)
## (Intercept) -14.757 < 2e-16
## cleanliness_LinMeaScore 4.208 2.58e-05
## nurse_comm_LinMeaScore 2.494 0.012620
## doc_comm_LinMeaScore 1.092 0.274711
## staff_response_LinMeaScore 4.774 1.80e-06
## med_commu_LinMeaScore 0.784 0.433023
## discharge_info_LinMeaScore 1.763 0.077838
## care_trans_LinMeaScore 2.260 0.023828
## quietness_LinMeaScore -0.132 0.895292
## recommend_LinMeaScore 7.802 6.09e-15
## EHR_criteria1 -0.622 0.533898
## Emergency.ServicesYes -0.101 0.919769
## MortalityBelow the national average -20.346 < 2e-16
## MortalitySame as the national average -15.124 < 2e-16
## EffectivenessBelow the national average -2.402 0.016326
## EffectivenessSame as the national average -0.315 0.753054
## TimelinessBelow the national average -3.687 0.000227
## TimelinessSame as the national average -2.924 0.003457
## SafetyBelow the national average -21.968 < 2e-16
## SafetySame as the national average -17.043 < 2e-16
## ReadmissionBelow the national average -22.392 < 2e-16
## ReadmissionSame as the national average -17.333 < 2e-16
## Efficient.imagingBelow the national average -4.922 8.55e-07
## Efficient.imagingSame as the national average -0.866 0.386301
## Hospital.OwnershipGovernment - Hospital District or Authority -1.663 0.096216
## Hospital.OwnershipGovernment - Local -1.791 0.073243
## Hospital.OwnershipGovernment - State -1.316 0.188206
## Hospital.OwnershipPhysician -1.626 0.103892
## Hospital.OwnershipProprietary -1.367 0.171514
## Hospital.OwnershipVoluntary non-profit - Church -0.998 0.318298
## Hospital.OwnershipVoluntary non-profit - Other -1.272 0.203246
## Hospital.OwnershipVoluntary non-profit - Private -1.441 0.149445
##
## (Intercept) ***
## cleanliness_LinMeaScore ***
## nurse_comm_LinMeaScore *
## doc_comm_LinMeaScore
## staff_response_LinMeaScore ***
## med_commu_LinMeaScore
## discharge_info_LinMeaScore .
## care_trans_LinMeaScore *
## quietness_LinMeaScore
## recommend_LinMeaScore ***
## EHR_criteria1
## Emergency.ServicesYes
## MortalityBelow the national average ***
## MortalitySame as the national average ***
## EffectivenessBelow the national average *
## EffectivenessSame as the national average
## TimelinessBelow the national average ***
## TimelinessSame as the national average **
## SafetyBelow the national average ***
## SafetySame as the national average ***
## ReadmissionBelow the national average ***
## ReadmissionSame as the national average ***
## Efficient.imagingBelow the national average ***
## Efficient.imagingSame as the national average
## Hospital.OwnershipGovernment - Hospital District or Authority .
## Hospital.OwnershipGovernment - Local .
## Hospital.OwnershipGovernment - State
## Hospital.OwnershipPhysician
## Hospital.OwnershipProprietary
## Hospital.OwnershipVoluntary non-profit - Church
## Hospital.OwnershipVoluntary non-profit - Other
## Hospital.OwnershipVoluntary non-profit - Private
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5433.0 on 4606 degrees of freedom
## Residual deviance: 2213.2 on 4575 degrees of freedom
## AIC: 2277.2
##
## Number of Fisher Scoring iterations: 7
####From the results we found that cleanliness, Nurse Communication, Staff Response, Recommendation, Mortality, Timeliness, Safety, Readmission, Efficient Imaging, and different categories of Hospital Ownership were significant factors.
####Positive coefficients of variables indicate for each unit increase in variable score, the log-odds of a higher hospital rating increase by that coefficient value. whereas negative coefficient indicate that hospitals with variable rates below the national average are associated with a substantial decrease in the log-odds of higher ratings.
# Performing stepwise variable selection for logistic regression
fwd_rate <- step(glm(formula = hosp_rate ~ 1, data = hospitals_data, family = binomial(link = 'logit')),
scope = formula(log_model), direction = 'forward')
## Start: AIC=5435.05
## hosp_rate ~ 1
##
## Df Deviance AIC
## + care_trans_LinMeaScore 1 4461.9 4465.9
## + recommend_LinMeaScore 1 4467.9 4471.9
## + Readmission 2 4496.7 4502.7
## + nurse_comm_LinMeaScore 1 4508.1 4512.1
## + staff_response_LinMeaScore 1 4671.3 4675.3
## + med_commu_LinMeaScore 1 4729.6 4733.6
## + cleanliness_LinMeaScore 1 4825.5 4829.5
## + Safety 2 4875.8 4881.8
## + discharge_info_LinMeaScore 1 4882.3 4886.3
## + doc_comm_LinMeaScore 1 4911.0 4915.0
## + Mortality 2 5193.2 5199.2
## + quietness_LinMeaScore 1 5225.3 5229.3
## + Timeliness 2 5302.7 5308.7
## + Effectiveness 2 5352.5 5358.5
## + Hospital.Ownership 8 5349.7 5367.7
## + Efficient.imaging 2 5415.0 5421.0
## <none> 5433.0 5435.0
## + EHR_criteria 1 5432.6 5436.6
## + Emergency.Services 1 5432.8 5436.8
##
## Step: AIC=4465.91
## hosp_rate ~ care_trans_LinMeaScore
##
## Df Deviance AIC
## + Readmission 2 3808.1 3816.1
## + Safety 2 3905.6 3913.6
## + Mortality 2 4225.0 4233.0
## + recommend_LinMeaScore 1 4338.2 4344.2
## + staff_response_LinMeaScore 1 4353.0 4359.0
## + nurse_comm_LinMeaScore 1 4356.9 4362.9
## + cleanliness_LinMeaScore 1 4383.7 4389.7
## + med_commu_LinMeaScore 1 4397.9 4403.9
## + discharge_info_LinMeaScore 1 4416.0 4422.0
## + Effectiveness 2 4430.7 4438.7
## + Timeliness 2 4431.4 4439.4
## + doc_comm_LinMeaScore 1 4437.3 4443.3
## + Efficient.imaging 2 4443.4 4451.4
## + quietness_LinMeaScore 1 4455.0 4461.0
## + Hospital.Ownership 8 4442.4 4462.4
## <none> 4461.9 4465.9
## + Emergency.Services 1 4461.6 4467.6
## + EHR_criteria 1 4461.9 4467.9
##
## Step: AIC=3816.12
## hosp_rate ~ care_trans_LinMeaScore + Readmission
##
## Df Deviance AIC
## + Safety 2 3167.7 3179.7
## + Mortality 2 3380.5 3392.5
## + nurse_comm_LinMeaScore 1 3676.5 3686.5
## + staff_response_LinMeaScore 1 3706.9 3716.9
## + recommend_LinMeaScore 1 3729.8 3739.8
## + cleanliness_LinMeaScore 1 3742.6 3752.6
## + med_commu_LinMeaScore 1 3753.7 3763.7
## + Effectiveness 2 3779.3 3791.3
## + Efficient.imaging 2 3788.2 3800.2
## + discharge_info_LinMeaScore 1 3791.6 3801.6
## + doc_comm_LinMeaScore 1 3794.2 3804.2
## + Timeliness 2 3794.3 3806.3
## + Hospital.Ownership 8 3787.7 3811.7
## + quietness_LinMeaScore 1 3802.5 3812.5
## <none> 3808.1 3816.1
## + Emergency.Services 1 3808.0 3818.0
## + EHR_criteria 1 3808.1 3818.1
##
## Step: AIC=3179.7
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety
##
## Df Deviance AIC
## + Mortality 2 2570.8 2586.8
## + nurse_comm_LinMeaScore 1 3045.9 3059.9
## + staff_response_LinMeaScore 1 3065.0 3079.0
## + recommend_LinMeaScore 1 3072.5 3086.5
## + cleanliness_LinMeaScore 1 3098.6 3112.6
## + med_commu_LinMeaScore 1 3112.4 3126.4
## + Efficient.imaging 2 3137.6 3153.6
## + discharge_info_LinMeaScore 1 3141.8 3155.8
## + Effectiveness 2 3150.1 3166.1
## + doc_comm_LinMeaScore 1 3153.8 3167.8
## + Hospital.Ownership 8 3144.1 3172.1
## + Timeliness 2 3156.6 3172.6
## <none> 3167.7 3179.7
## + quietness_LinMeaScore 1 3166.5 3180.5
## + Emergency.Services 1 3167.6 3181.6
## + EHR_criteria 1 3167.7 3181.7
##
## Step: AIC=2586.84
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality
##
## Df Deviance AIC
## + staff_response_LinMeaScore 1 2409.9 2427.9
## + nurse_comm_LinMeaScore 1 2430.3 2448.3
## + recommend_LinMeaScore 1 2486.9 2504.9
## + cleanliness_LinMeaScore 1 2489.3 2507.3
## + med_commu_LinMeaScore 1 2498.7 2516.7
## + Timeliness 2 2525.9 2545.9
## + Efficient.imaging 2 2529.5 2549.5
## + discharge_info_LinMeaScore 1 2532.4 2550.4
## + doc_comm_LinMeaScore 1 2532.5 2550.5
## + quietness_LinMeaScore 1 2555.6 2573.6
## + Effectiveness 2 2559.6 2579.6
## <none> 2570.8 2586.8
## + Hospital.Ownership 8 2555.1 2587.1
## + Emergency.Services 1 2569.3 2587.3
## + EHR_criteria 1 2570.8 2588.8
##
## Step: AIC=2427.87
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore
##
## Df Deviance AIC
## + recommend_LinMeaScore 1 2330.3 2350.3
## + Efficient.imaging 2 2367.5 2389.5
## + nurse_comm_LinMeaScore 1 2383.9 2403.9
## + cleanliness_LinMeaScore 1 2391.0 2411.0
## + discharge_info_LinMeaScore 1 2397.3 2417.3
## + Timeliness 2 2395.6 2417.6
## + med_commu_LinMeaScore 1 2399.5 2419.5
## + doc_comm_LinMeaScore 1 2400.4 2420.4
## + Effectiveness 2 2398.7 2420.7
## + quietness_LinMeaScore 1 2407.3 2427.3
## <none> 2409.9 2427.9
## + Hospital.Ownership 8 2394.0 2428.0
## + EHR_criteria 1 2409.7 2429.7
## + Emergency.Services 1 2409.8 2429.8
##
## Step: AIC=2350.29
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore + recommend_LinMeaScore
##
## Df Deviance AIC
## + Efficient.imaging 2 2290.8 2314.8
## + cleanliness_LinMeaScore 1 2307.9 2329.9
## + Timeliness 2 2309.6 2333.6
## + nurse_comm_LinMeaScore 1 2316.6 2338.6
## + discharge_info_LinMeaScore 1 2321.2 2343.2
## + Effectiveness 2 2319.7 2343.7
## + med_commu_LinMeaScore 1 2324.4 2346.4
## + doc_comm_LinMeaScore 1 2326.9 2348.9
## <none> 2330.3 2350.3
## + Hospital.Ownership 8 2315.2 2351.2
## + EHR_criteria 1 2330.0 2352.0
## + quietness_LinMeaScore 1 2330.2 2352.2
## + Emergency.Services 1 2330.2 2352.2
##
## Step: AIC=2314.84
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore + recommend_LinMeaScore + Efficient.imaging
##
## Df Deviance AIC
## + cleanliness_LinMeaScore 1 2269.7 2295.7
## + Timeliness 2 2270.2 2298.2
## + nurse_comm_LinMeaScore 1 2276.2 2302.2
## + med_commu_LinMeaScore 1 2283.9 2309.9
## + discharge_info_LinMeaScore 1 2284.4 2310.4
## + Effectiveness 2 2282.7 2310.7
## + doc_comm_LinMeaScore 1 2285.5 2311.5
## <none> 2290.8 2314.8
## + quietness_LinMeaScore 1 2289.3 2315.3
## + EHR_criteria 1 2290.7 2316.7
## + Emergency.Services 1 2290.8 2316.8
## + Hospital.Ownership 8 2280.0 2320.0
##
## Step: AIC=2295.67
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore + recommend_LinMeaScore + Efficient.imaging +
## cleanliness_LinMeaScore
##
## Df Deviance AIC
## + Timeliness 2 2250.5 2280.5
## + nurse_comm_LinMeaScore 1 2259.1 2287.1
## + discharge_info_LinMeaScore 1 2261.9 2289.9
## + doc_comm_LinMeaScore 1 2262.8 2290.8
## + Effectiveness 2 2261.2 2291.2
## + med_commu_LinMeaScore 1 2264.1 2292.1
## <none> 2269.7 2295.7
## + quietness_LinMeaScore 1 2268.7 2296.7
## + EHR_criteria 1 2269.5 2297.5
## + Emergency.Services 1 2269.6 2297.6
## + Hospital.Ownership 8 2257.2 2299.2
##
## Step: AIC=2280.54
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore + recommend_LinMeaScore + Efficient.imaging +
## cleanliness_LinMeaScore + Timeliness
##
## Df Deviance AIC
## + nurse_comm_LinMeaScore 1 2240.1 2272.1
## + discharge_info_LinMeaScore 1 2244.4 2276.4
## + Effectiveness 2 2242.7 2276.7
## + med_commu_LinMeaScore 1 2246.4 2278.4
## + doc_comm_LinMeaScore 1 2246.8 2278.8
## <none> 2250.5 2280.5
## + EHR_criteria 1 2250.4 2282.4
## + Emergency.Services 1 2250.5 2282.5
## + quietness_LinMeaScore 1 2250.5 2282.5
## + Hospital.Ownership 8 2238.2 2284.2
##
## Step: AIC=2272.11
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore + recommend_LinMeaScore + Efficient.imaging +
## cleanliness_LinMeaScore + Timeliness + nurse_comm_LinMeaScore
##
## Df Deviance AIC
## + Effectiveness 2 2232.1 2268.1
## + discharge_info_LinMeaScore 1 2235.0 2269.0
## + med_commu_LinMeaScore 1 2238.0 2272.0
## <none> 2240.1 2272.1
## + doc_comm_LinMeaScore 1 2238.8 2272.8
## + EHR_criteria 1 2239.8 2273.8
## + Emergency.Services 1 2240.0 2274.0
## + quietness_LinMeaScore 1 2240.0 2274.0
## + Hospital.Ownership 8 2228.1 2276.1
##
## Step: AIC=2268.14
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore + recommend_LinMeaScore + Efficient.imaging +
## cleanliness_LinMeaScore + Timeliness + nurse_comm_LinMeaScore +
## Effectiveness
##
## Df Deviance AIC
## + discharge_info_LinMeaScore 1 2226.9 2264.9
## + med_commu_LinMeaScore 1 2230.1 2268.1
## <none> 2232.1 2268.1
## + doc_comm_LinMeaScore 1 2230.7 2268.7
## + EHR_criteria 1 2231.8 2269.8
## + quietness_LinMeaScore 1 2232.1 2270.1
## + Emergency.Services 1 2232.1 2270.1
## + Hospital.Ownership 8 2220.2 2272.2
##
## Step: AIC=2264.93
## hosp_rate ~ care_trans_LinMeaScore + Readmission + Safety + Mortality +
## staff_response_LinMeaScore + recommend_LinMeaScore + Efficient.imaging +
## cleanliness_LinMeaScore + Timeliness + nurse_comm_LinMeaScore +
## Effectiveness + discharge_info_LinMeaScore
##
## Df Deviance AIC
## <none> 2226.9 2264.9
## + doc_comm_LinMeaScore 1 2225.7 2265.7
## + med_commu_LinMeaScore 1 2226.0 2266.0
## + EHR_criteria 1 2226.5 2266.5
## + Emergency.Services 1 2226.9 2266.9
## + quietness_LinMeaScore 1 2226.9 2266.9
## + Hospital.Ownership 8 2215.8 2269.8
bkwd_rate <- step(log_model, direction = 'backward')
## Start: AIC=2277.21
## hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## doc_comm_LinMeaScore + staff_response_LinMeaScore + med_commu_LinMeaScore +
## discharge_info_LinMeaScore + care_trans_LinMeaScore + quietness_LinMeaScore +
## recommend_LinMeaScore + EHR_criteria + Emergency.Services +
## Mortality + Effectiveness + Timeliness + Safety + Readmission +
## Efficient.imaging + Hospital.Ownership
##
## Df Deviance AIC
## - Hospital.Ownership 8 2224.6 2272.6
## - Emergency.Services 1 2213.2 2275.2
## - quietness_LinMeaScore 1 2213.2 2275.2
## - EHR_criteria 1 2213.6 2275.6
## - med_commu_LinMeaScore 1 2213.8 2275.8
## - doc_comm_LinMeaScore 1 2214.4 2276.4
## <none> 2213.2 2277.2
## - discharge_info_LinMeaScore 1 2216.3 2278.3
## - care_trans_LinMeaScore 1 2218.3 2280.3
## - Effectiveness 2 2221.3 2281.3
## - nurse_comm_LinMeaScore 1 2219.5 2281.5
## - Timeliness 2 2227.7 2287.7
## - cleanliness_LinMeaScore 1 2231.3 2293.3
## - staff_response_LinMeaScore 1 2236.3 2298.3
## - Efficient.imaging 2 2245.4 2305.4
## - recommend_LinMeaScore 1 2277.5 2339.5
## - Mortality 2 2854.9 2914.9
## - Safety 2 3063.2 3123.2
## - Readmission 2 3184.0 3244.0
##
## Step: AIC=2272.57
## hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## doc_comm_LinMeaScore + staff_response_LinMeaScore + med_commu_LinMeaScore +
## discharge_info_LinMeaScore + care_trans_LinMeaScore + quietness_LinMeaScore +
## recommend_LinMeaScore + EHR_criteria + Emergency.Services +
## Mortality + Effectiveness + Timeliness + Safety + Readmission +
## Efficient.imaging
##
## Df Deviance AIC
## - Emergency.Services 1 2224.6 2270.6
## - quietness_LinMeaScore 1 2224.7 2270.7
## - EHR_criteria 1 2225.0 2271.0
## - med_commu_LinMeaScore 1 2225.2 2271.2
## - doc_comm_LinMeaScore 1 2225.5 2271.5
## <none> 2224.6 2272.6
## - discharge_info_LinMeaScore 1 2228.3 2274.3
## - care_trans_LinMeaScore 1 2229.8 2275.8
## - Effectiveness 2 2232.7 2276.7
## - nurse_comm_LinMeaScore 1 2231.4 2277.4
## - Timeliness 2 2239.7 2283.7
## - cleanliness_LinMeaScore 1 2241.9 2287.9
## - staff_response_LinMeaScore 1 2246.7 2292.7
## - Efficient.imaging 2 2259.4 2303.4
## - recommend_LinMeaScore 1 2290.8 2336.8
## - Mortality 2 2869.8 2913.8
## - Safety 2 3074.6 3118.6
## - Readmission 2 3202.9 3246.9
##
## Step: AIC=2270.59
## hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## doc_comm_LinMeaScore + staff_response_LinMeaScore + med_commu_LinMeaScore +
## discharge_info_LinMeaScore + care_trans_LinMeaScore + quietness_LinMeaScore +
## recommend_LinMeaScore + EHR_criteria + Mortality + Effectiveness +
## Timeliness + Safety + Readmission + Efficient.imaging
##
## Df Deviance AIC
## - quietness_LinMeaScore 1 2224.8 2268.8
## - EHR_criteria 1 2225.0 2269.0
## - med_commu_LinMeaScore 1 2225.2 2269.2
## - doc_comm_LinMeaScore 1 2225.6 2269.6
## <none> 2224.6 2270.6
## - discharge_info_LinMeaScore 1 2228.3 2272.3
## - care_trans_LinMeaScore 1 2229.8 2273.8
## - Effectiveness 2 2232.7 2274.7
## - nurse_comm_LinMeaScore 1 2231.4 2275.4
## - Timeliness 2 2239.8 2281.8
## - cleanliness_LinMeaScore 1 2241.9 2285.9
## - staff_response_LinMeaScore 1 2246.8 2290.8
## - Efficient.imaging 2 2259.4 2301.4
## - recommend_LinMeaScore 1 2290.8 2334.8
## - Mortality 2 2869.9 2911.9
## - Safety 2 3074.7 3116.7
## - Readmission 2 3203.8 3245.8
##
## Step: AIC=2268.76
## hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## doc_comm_LinMeaScore + staff_response_LinMeaScore + med_commu_LinMeaScore +
## discharge_info_LinMeaScore + care_trans_LinMeaScore + recommend_LinMeaScore +
## EHR_criteria + Mortality + Effectiveness + Timeliness + Safety +
## Readmission + Efficient.imaging
##
## Df Deviance AIC
## - EHR_criteria 1 2225.2 2267.2
## - med_commu_LinMeaScore 1 2225.3 2267.3
## - doc_comm_LinMeaScore 1 2225.6 2267.6
## <none> 2224.8 2268.8
## - discharge_info_LinMeaScore 1 2228.8 2270.8
## - care_trans_LinMeaScore 1 2230.0 2272.0
## - Effectiveness 2 2232.9 2272.9
## - nurse_comm_LinMeaScore 1 2231.6 2273.6
## - Timeliness 2 2239.9 2279.9
## - cleanliness_LinMeaScore 1 2241.9 2283.9
## - staff_response_LinMeaScore 1 2246.9 2288.9
## - Efficient.imaging 2 2260.7 2300.7
## - recommend_LinMeaScore 1 2291.5 2333.5
## - Mortality 2 2875.0 2915.0
## - Safety 2 3074.7 3114.7
## - Readmission 2 3207.6 3247.6
##
## Step: AIC=2267.2
## hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## doc_comm_LinMeaScore + staff_response_LinMeaScore + med_commu_LinMeaScore +
## discharge_info_LinMeaScore + care_trans_LinMeaScore + recommend_LinMeaScore +
## Mortality + Effectiveness + Timeliness + Safety + Readmission +
## Efficient.imaging
##
## Df Deviance AIC
## - med_commu_LinMeaScore 1 2225.7 2265.7
## - doc_comm_LinMeaScore 1 2226.0 2266.0
## <none> 2225.2 2267.2
## - discharge_info_LinMeaScore 1 2229.2 2269.2
## - care_trans_LinMeaScore 1 2230.5 2270.5
## - Effectiveness 2 2233.3 2271.3
## - nurse_comm_LinMeaScore 1 2232.0 2272.0
## - Timeliness 2 2240.3 2278.3
## - cleanliness_LinMeaScore 1 2242.4 2282.4
## - staff_response_LinMeaScore 1 2247.3 2287.3
## - Efficient.imaging 2 2261.3 2299.3
## - recommend_LinMeaScore 1 2291.9 2331.9
## - Mortality 2 2875.5 2913.5
## - Safety 2 3074.8 3112.8
## - Readmission 2 3208.1 3246.1
##
## Step: AIC=2265.74
## hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## doc_comm_LinMeaScore + staff_response_LinMeaScore + discharge_info_LinMeaScore +
## care_trans_LinMeaScore + recommend_LinMeaScore + Mortality +
## Effectiveness + Timeliness + Safety + Readmission + Efficient.imaging
##
## Df Deviance AIC
## - doc_comm_LinMeaScore 1 2226.9 2264.9
## <none> 2225.7 2265.7
## - discharge_info_LinMeaScore 1 2230.7 2268.7
## - care_trans_LinMeaScore 1 2231.4 2269.4
## - Effectiveness 2 2233.9 2269.9
## - nurse_comm_LinMeaScore 1 2233.1 2271.1
## - Timeliness 2 2241.0 2277.0
## - cleanliness_LinMeaScore 1 2243.5 2281.5
## - staff_response_LinMeaScore 1 2250.0 2288.0
## - Efficient.imaging 2 2261.6 2297.6
## - recommend_LinMeaScore 1 2292.9 2330.9
## - Mortality 2 2877.6 2913.6
## - Safety 2 3075.7 3111.7
## - Readmission 2 3208.3 3244.3
##
## Step: AIC=2264.93
## hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
## staff_response_LinMeaScore + discharge_info_LinMeaScore +
## care_trans_LinMeaScore + recommend_LinMeaScore + Mortality +
## Effectiveness + Timeliness + Safety + Readmission + Efficient.imaging
##
## Df Deviance AIC
## <none> 2226.9 2264.9
## - discharge_info_LinMeaScore 1 2232.1 2268.1
## - Effectiveness 2 2235.0 2269.0
## - care_trans_LinMeaScore 1 2233.2 2269.2
## - nurse_comm_LinMeaScore 1 2236.5 2272.5
## - Timeliness 2 2244.0 2278.0
## - cleanliness_LinMeaScore 1 2243.9 2279.9
## - staff_response_LinMeaScore 1 2251.9 2287.9
## - Efficient.imaging 2 2262.0 2296.0
## - recommend_LinMeaScore 1 2297.1 2333.1
## - Mortality 2 2877.7 2911.7
## - Safety 2 3076.6 3110.6
## - Readmission 2 3211.7 3245.7
####Forward and backward selection process results suggests that cleanliness, nurse communication, staff communication, discharge information, care transitions, recommendation scores, mortality rates, and effectiveness, timeliness, safety, and readmission metrics are key factors influencing the overall hospital rating
# Running logistic regression on train dataset using only the suggested variables from forward backward selection
train_log <- glm(hosp_rate ~ care_trans_LinMeaScore + nurse_comm_LinMeaScore +
discharge_info_LinMeaScore + cleanliness_LinMeaScore + staff_response_LinMeaScore + Effectiveness + Timeliness + Efficient.imaging + recommend_LinMeaScore + Mortality + Safety + Readmission,
data = train_data,
family = binomial(link = 'logit'))
summary(train_log)
##
## Call:
## glm(formula = hosp_rate ~ care_trans_LinMeaScore + nurse_comm_LinMeaScore +
## discharge_info_LinMeaScore + cleanliness_LinMeaScore + staff_response_LinMeaScore +
## Effectiveness + Timeliness + Efficient.imaging + recommend_LinMeaScore +
## Mortality + Safety + Readmission, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -77.04426 5.19057 -14.843
## care_trans_LinMeaScore 0.10862 0.06399 1.697
## nurse_comm_LinMeaScore 0.21128 0.08345 2.532
## discharge_info_LinMeaScore 0.05562 0.03401 1.635
## cleanliness_LinMeaScore 0.12858 0.03106 4.139
## staff_response_LinMeaScore 0.22511 0.04070 5.531
## EffectivenessBelow the national average -0.88842 0.42395 -2.096
## EffectivenessSame as the national average -0.07129 0.28286 -0.252
## TimelinessBelow the national average -0.62953 0.20371 -3.090
## TimelinessSame as the national average -0.41958 0.16623 -2.524
## Efficient.imagingBelow the national average -1.32000 0.26591 -4.964
## Efficient.imagingSame as the national average -0.27260 0.19430 -1.403
## recommend_LinMeaScore 0.22151 0.03363 6.587
## MortalityBelow the national average -6.03024 0.35377 -17.046
## MortalitySame as the national average -2.57222 0.21556 -11.933
## SafetyBelow the national average -4.82202 0.26557 -18.157
## SafetySame as the national average -2.57611 0.17891 -14.399
## ReadmissionBelow the national average -5.04529 0.26819 -18.813
## ReadmissionSame as the national average -2.57192 0.17159 -14.989
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## care_trans_LinMeaScore 0.0896 .
## nurse_comm_LinMeaScore 0.0113 *
## discharge_info_LinMeaScore 0.1020
## cleanliness_LinMeaScore 3.48e-05 ***
## staff_response_LinMeaScore 3.18e-08 ***
## EffectivenessBelow the national average 0.0361 *
## EffectivenessSame as the national average 0.8010
## TimelinessBelow the national average 0.0020 **
## TimelinessSame as the national average 0.0116 *
## Efficient.imagingBelow the national average 6.91e-07 ***
## Efficient.imagingSame as the national average 0.1606
## recommend_LinMeaScore 4.50e-11 ***
## MortalityBelow the national average < 2e-16 ***
## MortalitySame as the national average < 2e-16 ***
## SafetyBelow the national average < 2e-16 ***
## SafetySame as the national average < 2e-16 ***
## ReadmissionBelow the national average < 2e-16 ***
## ReadmissionSame as the national average < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3785.5 on 3223 degrees of freedom
## Residual deviance: 1488.4 on 3205 degrees of freedom
## AIC: 1526.4
##
## Number of Fisher Scoring iterations: 7
summary(train_log)
##
## Call:
## glm(formula = hosp_rate ~ care_trans_LinMeaScore + nurse_comm_LinMeaScore +
## discharge_info_LinMeaScore + cleanliness_LinMeaScore + staff_response_LinMeaScore +
## Effectiveness + Timeliness + Efficient.imaging + recommend_LinMeaScore +
## Mortality + Safety + Readmission, family = binomial(link = "logit"),
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -77.04426 5.19057 -14.843
## care_trans_LinMeaScore 0.10862 0.06399 1.697
## nurse_comm_LinMeaScore 0.21128 0.08345 2.532
## discharge_info_LinMeaScore 0.05562 0.03401 1.635
## cleanliness_LinMeaScore 0.12858 0.03106 4.139
## staff_response_LinMeaScore 0.22511 0.04070 5.531
## EffectivenessBelow the national average -0.88842 0.42395 -2.096
## EffectivenessSame as the national average -0.07129 0.28286 -0.252
## TimelinessBelow the national average -0.62953 0.20371 -3.090
## TimelinessSame as the national average -0.41958 0.16623 -2.524
## Efficient.imagingBelow the national average -1.32000 0.26591 -4.964
## Efficient.imagingSame as the national average -0.27260 0.19430 -1.403
## recommend_LinMeaScore 0.22151 0.03363 6.587
## MortalityBelow the national average -6.03024 0.35377 -17.046
## MortalitySame as the national average -2.57222 0.21556 -11.933
## SafetyBelow the national average -4.82202 0.26557 -18.157
## SafetySame as the national average -2.57611 0.17891 -14.399
## ReadmissionBelow the national average -5.04529 0.26819 -18.813
## ReadmissionSame as the national average -2.57192 0.17159 -14.989
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## care_trans_LinMeaScore 0.0896 .
## nurse_comm_LinMeaScore 0.0113 *
## discharge_info_LinMeaScore 0.1020
## cleanliness_LinMeaScore 3.48e-05 ***
## staff_response_LinMeaScore 3.18e-08 ***
## EffectivenessBelow the national average 0.0361 *
## EffectivenessSame as the national average 0.8010
## TimelinessBelow the national average 0.0020 **
## TimelinessSame as the national average 0.0116 *
## Efficient.imagingBelow the national average 6.91e-07 ***
## Efficient.imagingSame as the national average 0.1606
## recommend_LinMeaScore 4.50e-11 ***
## MortalityBelow the national average < 2e-16 ***
## MortalitySame as the national average < 2e-16 ***
## SafetyBelow the national average < 2e-16 ***
## SafetySame as the national average < 2e-16 ***
## ReadmissionBelow the national average < 2e-16 ***
## ReadmissionSame as the national average < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3785.5 on 3223 degrees of freedom
## Residual deviance: 1488.4 on 3205 degrees of freedom
## AIC: 1526.4
##
## Number of Fisher Scoring iterations: 7
####From the results we found that cleanliness_LinMeaScore, staff_response_LinMeaScore, Efficient.imagingBelow the national averagerecommend_LinMeaScore, MortalityBelow the national average, SafetyBelow the national averag,ReadmissionBelow the national average are significant factors.
####Positive coefficients of variables indicate for each unit increase in variable score, the log-odds of a higher hospital rating increase by that coefficient value. whereas negative coefficient indicate that hospitals with variable rates below the national average are associated with a substantial decrease in the log-odds of higher ratings.
# Calculating and assigning predictions for hospital ratings using different logistic models
test_data$pred_hosp_rate <- predict(train_log, newdata = test_data, type = 'response')
test_data$pred_hosp_rate <- as.numeric(test_data$pred_hosp_rate)
# converting the predicted hospital rates in to binary factor
test_data$pred_hosp_rate <- as.factor(ifelse(test_data$pred_hosp_rate >= 0.5, "1", "0"))
test_data$hosp_rate <- as.factor(as.character(test_data$hosp_rate))
#checking the levles of both predicted and actual
levels(hospitals_data$hosp_rate)
## [1] "0" "1"
levels(test_data$pred_hosp_rate)
## [1] "0" "1"
#Confusion matrix for logistic regression
library(caret)
## Loading required package: lattice
conf_matrix_log <- confusionMatrix(test_data$hosp_rate, test_data$pred_hosp_rate)
print(conf_matrix_log)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 918 74
## 1 93 298
##
## Accuracy : 0.8792
## 95% CI : (0.8609, 0.896)
## No Information Rate : 0.731
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6978
##
## Mcnemar's Test P-Value : 0.1637
##
## Sensitivity : 0.9080
## Specificity : 0.8011
## Pos Pred Value : 0.9254
## Neg Pred Value : 0.7621
## Prevalence : 0.7310
## Detection Rate : 0.6638
## Detection Prevalence : 0.7173
## Balanced Accuracy : 0.8545
##
## 'Positive' Class : 0
##
####Results of the confusion matrix tells how well the model performed on new data.Overall Accuracy: The model accurately predicts hospital ratings 87.92% of the time.Sensitivity (0.9080) and Specificity (0.8011): values of sensitivity and specificity shows that the model is good at identifying both positive and negative cases.
# Extracting error metrics from the confusion matrix
accuracy_log <- conf_matrix_log$overall['Accuracy']
precision_log <- conf_matrix_log$byClass['Precision']
recall_log <- conf_matrix_log$byClass['Recall']
f1_score_log <- conf_matrix_log$byClass['F1']
print(precision_log)
## Precision
## 0.9254032
print(recall_log)
## Recall
## 0.9080119
print(f1_score_log)
## F1
## 0.9166251
# Converting factors to numeric to get error metrics
test_data$pred_hosp_rate <- as.numeric(levels(test_data$pred_hosp_rate))[test_data$pred_hosp_rate]
test_data$hosp_rate <- as.numeric(levels(test_data$hosp_rate))[test_data$hosp_rate]
# Calculate errors and error metrics
error_hosp_rate <- test_data$hosp_rate - test_data$pred_hosp_rate
mae_hosp_rate <- mean(abs(error_hosp_rate))
mse_hosp_rate <- mean(error_hosp_rate^2)
rmse_hosp_rate <- sqrt(mse_hosp_rate)
# Creating df for metrics
error_metrics <- data.frame(
Model = "hosp_rate",
MAE = mae_hosp_rate,
MSE = mse_hosp_rate,
RMSE = rmse_hosp_rate
)
# Printing the error metrics dataframe
print(error_metrics)
## Model MAE MSE RMSE
## 1 hosp_rate 0.120752 0.120752 0.3474939
####Lower MAE, MSE, and RMSE values are desirable, indicating smaller prediction errors.
# running cross validation for logistic regression model with 10 folds
library(caret)
set.seed(123)
# Creating training control with 10-fold cross-validation
ctrl_log <- trainControl(method = "cv", number = 10, summaryFunction = twoClassSummary, classProbs = TRUE)
# Ensuring that hosp_rate is a factor variable
train_data$hosp_rate <- as.factor(train_data$hosp_rate)
# Creating a new variable for levels
train_data$hosp_rate_levels <- train_data$hosp_rate
# Setting levels using make.names
levels(train_data$hosp_rate_levels) <- make.names(levels(train_data$hosp_rate))
# Training the logistic regression model with cross-validation
cv_log_results <- train(as.factor(hosp_rate_levels) ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
doc_comm_LinMeaScore + staff_response_LinMeaScore +
med_commu_LinMeaScore + discharge_info_LinMeaScore + care_trans_LinMeaScore +
quietness_LinMeaScore + recommend_LinMeaScore + EHR_criteria +
Emergency.Services + Mortality + Effectiveness + Timeliness + Safety + Readmission +
Efficient.imaging + Hospital.Ownership,
data = train_data, method = "glm", family = "binomial", trControl = ctrl_log)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
# Printing the results
print(cv_log_results)
## Generalized Linear Model
##
## 3224 samples
## 18 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2901, 2902, 2902, 2902, 2902, 2901, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9546511 0.933788 0.7803243
####When 10-fold cross-validation was used to assess the logistic regression model, it produced a high ROC (Receiver Operating Characteristic) value of about 0.95. Suggesting it is better at differentiating between the two groups. The specificity, which indicates the accuracy in detecting negative situations, is roughly 78.03%, while the sensitivity, which measures the ability to accurately identify positive cases, is approximately 93.38%. These results imply that the model is promising for predicting hospital evaluations since it strikes a decent balance between accurately recognizing positive and negative incidents.
#Random Forest Technique
# Loading required libraries for random forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# random forest
rf_model <- randomForest(as.factor(hosp_rate) ~ care_trans_LinMeaScore + nurse_comm_LinMeaScore+
discharge_info_LinMeaScore + cleanliness_LinMeaScore + staff_response_LinMeaScore+ Effectiveness + Timeliness + Efficient.imaging + recommend_LinMeaScore + Mortality + Safety + Readmission, data = train_data, ntree = 200,
mtry = 4, nodesize = 5, importance = TRUE,probability = TRUE)
## variable importance plot
importance_var <- varImpPlot(rf_model, type = 1)
importance_var
## MeanDecreaseAccuracy
## care_trans_LinMeaScore 23.203781
## nurse_comm_LinMeaScore 21.473396
## discharge_info_LinMeaScore 15.917174
## cleanliness_LinMeaScore 18.824919
## staff_response_LinMeaScore 28.635096
## Effectiveness 9.438475
## Timeliness 11.191392
## Efficient.imaging 14.256344
## recommend_LinMeaScore 23.776816
## Mortality 32.970988
## Safety 60.772804
## Readmission 70.600930
#####The variable importance plot shows which variables are most important for predicting hospital rates in a random forest model. Important variables include Readmission, Safety, and Mortality significantly impacting the model’s accuracy. Variables like Staff Response, Recommendations, and Nurse Communication also play important roles, while variables like Effectiveness and Timeliness have comparatively less impact.
# predicting hospital ratings on random forest model on the test data
test_data$rf_model_pred <- predict(rf_model, test_data, type="response")
# Ensuring and setting both actual and predicted variable levels to be same
test_data$hosp_rate <- as.factor(test_data$hosp_rate)
levels(test_data$hosp_rate) <- levels(test_data$rf_model_pred)
# checking the levels of both actual and predicted
levels(test_data$hosp_rate)
## [1] "0" "1"
levels(test_data$rf_model_pred)
## [1] "0" "1"
# confusion matrix for random forest model
conf_matrix_rf <- confusionMatrix(test_data$rf_model_pred, factor(test_data$hosp_rate))
conf_matrix_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 924 103
## 1 68 288
##
## Accuracy : 0.8764
## 95% CI : (0.8578, 0.8933)
## No Information Rate : 0.7173
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6866
##
## Mcnemar's Test P-Value : 0.009321
##
## Sensitivity : 0.9315
## Specificity : 0.7366
## Pos Pred Value : 0.8997
## Neg Pred Value : 0.8090
## Prevalence : 0.7173
## Detection Rate : 0.6681
## Detection Prevalence : 0.7426
## Balanced Accuracy : 0.8340
##
## 'Positive' Class : 0
##
####The confusion matrix assesses the model’s performance on new data, indicating an overall accuracy of 87.64%. Sensitivity (0.9315) reflects the model’s effectiveness in identifying high-rated hospitals, while specificity (0.7366) suggests a moderate success in distinguishing low-rated ones.
# Extracting error metrics from the confusion matrix
accuracy_rf <- conf_matrix_rf$overall['Accuracy']
precision_rf <- conf_matrix_rf$byClass['Precision']
recall_rf <- conf_matrix_rf$byClass['Recall']
f1_score_rf <- conf_matrix_rf$byClass['F1']
print(precision_log)
## Precision
## 0.9254032
print(recall_log)
## Recall
## 0.9080119
print(f1_score_log)
## F1
## 0.9166251
# validating random forset model performance using cross validation technique
library(caret)
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
cv_rf_results <- train(as.factor(hosp_rate) ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
doc_comm_LinMeaScore + staff_response_LinMeaScore +
med_commu_LinMeaScore + discharge_info_LinMeaScore + care_trans_LinMeaScore +
quietness_LinMeaScore + recommend_LinMeaScore + EHR_criteria + Emergency.Services + Mortality + Effectiveness + Timeliness + Safety + Readmission +Efficient.imaging + Hospital.Ownership,
data = train_data, method = "rf", trControl = ctrl)
cv_rf_results
## Random Forest
##
## 3224 samples
## 18 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2901, 2902, 2902, 2902, 2902, 2901, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8591937 0.6053438
## 16 0.8985828 0.7347742
## 31 0.8973405 0.7329777
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 16.
####At each split, 16 features are used by the best Random Forest model, which was selected based on its maximum accuracy has kappa value of 0.73 and accuracy 89.86%. which means model is expected to be correct about 89.86% of the time, and the kappa value indicates a substantial level of agreement beyond chance.
#SVM model
# Training the SVM model
library(e1071)
svm_model <- svm(hosp_rate ~ care_trans_LinMeaScore + nurse_comm_LinMeaScore+
discharge_info_LinMeaScore + cleanliness_LinMeaScore + staff_response_LinMeaScore+ Effectiveness + Timeliness + Efficient.imaging + recommend_LinMeaScore + Mortality + Safety + Readmission ,
data = train_data,
kernel = "linear")
summary(svm_model)
##
## Call:
## svm(formula = hosp_rate ~ care_trans_LinMeaScore + nurse_comm_LinMeaScore +
## discharge_info_LinMeaScore + cleanliness_LinMeaScore + staff_response_LinMeaScore +
## Effectiveness + Timeliness + Efficient.imaging + recommend_LinMeaScore +
## Mortality + Safety + Readmission, data = train_data, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 845
##
## ( 424 421 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
library(caret)
# Making predictions on the test set
test_data$svm_pred <- predict(svm_model, newdata = test_data)
test_data$hosp_rate = as.factor(test_data$hosp_rate)
test_data$svm_pred = as.factor(test_data$svm_pred)
# Creating a confusion matrix
conf_matrix_SVM <- confusionMatrix(test_data$svm_pred, test_data$hosp_rate)
print(conf_matrix_SVM)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 911 92
## 1 81 299
##
## Accuracy : 0.8749
## 95% CI : (0.8563, 0.8919)
## No Information Rate : 0.7173
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6889
##
## Mcnemar's Test P-Value : 0.4471
##
## Sensitivity : 0.9183
## Specificity : 0.7647
## Pos Pred Value : 0.9083
## Neg Pred Value : 0.7868
## Prevalence : 0.7173
## Detection Rate : 0.6587
## Detection Prevalence : 0.7252
## Balanced Accuracy : 0.8415
##
## 'Positive' Class : 0
##
####The SVM model’s confusion matrix shows that its overall accuracy is roughly 87.49%. At 91.83%, the sensitivity is strong, indicating that the model can accurately identify hospitals with a high rating . Specificity, which measures how well the model can identify hospitals with low ratings, is approximately 76.47%. These findings imply that, with a marginally increased emphasis on sensitivity, the SVM model is useful for selecting highly rated hospitals. This makes it a potentially useful tool for identifying high-rated hospitals, but its ability to identify low-rated ones may be only marginally successful.
# Extracting error metrics from the confusion matrix
accuracy_SVM <- conf_matrix_SVM$overall['Accuracy']
precision_SVM <- conf_matrix_SVM$byClass['Precision']
recall_SVM <- conf_matrix_SVM$byClass['Recall']
f1_score_SVM <- conf_matrix_SVM$byClass['F1']
print(precision_SVM)
## Precision
## 0.9082752
print(recall_SVM)
## Recall
## 0.9183468
print(f1_score_SVM)
## F1
## 0.9132832
#validating random forset model performance using cross validation technique
library(caret)
library(e1071)
# Setting seed for reproducibility
set.seed(123)
# Defining the SVM formula
formula_svm <- as.formula("hosp_rate ~ cleanliness_LinMeaScore + nurse_comm_LinMeaScore +
doc_comm_LinMeaScore + staff_response_LinMeaScore +
med_commu_LinMeaScore + discharge_info_LinMeaScore + care_trans_LinMeaScore +
quietness_LinMeaScore + recommend_LinMeaScore + EHR_criteria +
Emergency.Services + Mortality + Effectiveness + Timeliness + Safety + Readmission +
Efficient.imaging + Hospital.Ownership")
# Creating a training control with 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
# Training the SVM model using cross-validation
cv_svm_results <- train(formula_svm, data = train_data, method = "svmRadial", trControl = ctrl)
# Printing the cross-validation results
print(cv_svm_results)
## Support Vector Machines with Radial Basis Function Kernel
##
## 3224 samples
## 18 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2901, 2902, 2902, 2902, 2902, 2901, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8793522 0.6754268
## 0.50 0.8905198 0.7095017
## 1.00 0.8951734 0.7272283
##
## Tuning parameter 'sigma' was held constant at a value of 0.02332418
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.02332418 and C = 1.
####Using a 10-fold cross-validation with a Radial Basis Function (RBF) kernel, the SVM model achieved a kappa value of 0.73 and an accuracy of approximately 89.52% on the training set of data. For best results, the model’s parameters were selected, with a fixed radial basis function parameter (sigma) of 0.0233 and a cost parameter (C) of 1. These results imply that the SVM model is good at predicting hospital ratings while maintaining a reasonable degree of generalization.
# Creating a new trial dataset for prediction
new_trial_data <- data.frame(
cleanliness_LinMeaScore = 85,
nurse_comm_LinMeaScore = 83,
doc_comm_LinMeaScore = 87,
staff_response_LinMeaScore = 89,
med_commu_LinMeaScore = 86,
discharge_info_LinMeaScore = 88,
care_trans_LinMeaScore = 82,
quietness_LinMeaScore = 84,
recommend_LinMeaScore = 86,
Mortality = "Below the national average",
Effectiveness = "Below the national average",
Timeliness = "Below the national average",
Safety = "Below the national average",
Readmission = "Below the national average",
Efficient.imaging = "Below the national average",
EHR_criteria = "1",
Emergency.Services = "Yes",
Hospital.Ownership = "Government - Local"
)
# Predicting hospital rates for the new trial dataset using the SVM model
new_trial_data$predicted_hosp_rate <- predict(log_model, newdata = new_trial_data)
new_trial_data$predicted_hosp_rate <- as.factor(ifelse(new_trial_data$predicted_hosp_rate> 0.6, "1", "0"))
#test_data$hosp_rate <- as.factor(as.character(test_data$hosp_rate))